/*
 * Written by J.T. Conklin <jtc@acorntoolworks.com>
 * Public domain.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
	RCSID("$NetBSD: memchr.S,v 1.2 2014/03/22 19:38:46 jakllsch Exp $")
#endif

/*
 * memchr: scan a buffer for the first occurrence of a byte value.
 *
 * Arguments, as read from the stack after %esi has been pushed:
 *	 8(%esp)	start address of the buffer
 *	12(%esp)	value to look for; only the low byte is used
 *	16(%esp)	number of bytes to examine
 *
 * Returns in %eax the address of the first matching byte, or 0 if
 * none of the examined bytes matches (including a count of 0).
 *
 * Register use:
 *	%eax	current scan pointer, then the return value
 *	%ecx	search byte in %cl; replicated into all four bytes
 *		of %ecx once .Lword_aligned is reached
 *	%edx	scratch (overwritten, not preserved)
 *	%esi	bytes remaining; saved on entry and restored at .Ldone
 */
ENTRY(memchr)
	pushl	%esi
	movl	8(%esp),%eax
	movzbl	12(%esp),%ecx
	movl	16(%esp),%esi

	/*
	 * Align to word boundary: compare one byte at a time until
	 * the pointer is a multiple of 4 or the count runs out.
	 * Consider unrolling loop?
	 */
	testl	%esi,%esi	/* nbytes == 0? */
	je	.Lzero
.Lalign:
	testb	$3,%al
	je	.Lword_aligned
	cmpb	(%eax),%cl
	je	.Ldone
	incl	%eax
	decl	%esi
	jnz	.Lalign
	jmp	.Lzero

.Lword_aligned:
	/*
	 * Copy char to all bytes in word.  %ecx holds the zero-extended
	 * byte, so duplicating %cl into %ch and then OR-ing in a copy
	 * shifted left by 16 yields the byte in all four positions.
	 * %cl still holds the plain search byte afterwards.
	 */
	movb	%cl,%ch
	movl	%ecx,%edx
	sall	$16,%ecx
	orl	%edx,%ecx

	/*
	 * Word loop: examine 4 bytes per iteration while at least 4
	 * remain.  XOR with the replicated char turns every matching
	 * byte into 0; subtracting 0x01010101 then sets the top bit of
	 * any byte that was 0, which the 0x80808080 mask detects.
	 * The pointer and count are advanced before the test, so the
	 * word just examined lies at -4(%eax) .. -1(%eax) afterwards.
	 */
	_ALIGN_TEXT
.Lloop:
	cmpl	$3,%esi		/* nbytes >= 4? */
	jbe	.Lbyte
	movl	(%eax),%edx
	addl	$4,%eax
	xorl	%ecx,%edx
	subl	$4,%esi
	subl	$0x01010101,%edx
	testl	$0x80808080,%edx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word are
	 * equal to ch.  (The mask test also fires for a non-matching
	 * byte whose XOR result has its top bit set, for example.)
	 */

	/*
	 * High load-use latency on the Athlon leads to significant
	 * stalls, so we preload the next char as soon as possible
	 * instead of using cmp mem8, reg8.
	 *
	 * Alignment here avoids a stall on the Athlon, even though
	 * it's not a branch target.
	 *
	 * The movb between each cmpb and its jne does not modify the
	 * flags, so the branch still acts on the preceding compare.
	 * On a hit, %eax is moved back from "one past the word" to
	 * the address of the matching byte.
	 */
	_ALIGN_TEXT
	cmpb	-4(%eax),%cl	/* 1st byte == ch? */
	movb	-3(%eax),%dl
	jne	1f
	subl	$4,%eax
	jmp	.Ldone

	_ALIGN_TEXT
1:	cmpb	%dl,%cl		/* 2nd byte == ch? */
	movb	-2(%eax),%dl
	jne	1f
	subl	$3,%eax
	jmp	.Ldone

	_ALIGN_TEXT
1:	cmpb	%dl,%cl		/* 3rd byte == ch? */
	movb	-1(%eax),%dl
	jne	1f
	subl	$2,%eax
	jmp	.Ldone

	_ALIGN_TEXT
1:	cmpb	%dl,%cl		/* 4th byte == ch? */
	jne	.Lloop		/* false alarm: resume the word loop */
	decl	%eax
	jmp	.Ldone

	/*
	 * Tail: reached from .Lloop with 0..3 bytes remaining; check
	 * them one at a time.
	 */
.Lbyte:
	testl	%esi,%esi
	je	.Lzero
.Lbyte_loop:
	cmpb	(%eax),%cl
	je	.Ldone
	incl	%eax
	decl	%esi
	jnz	.Lbyte_loop

	/* Not found: return 0 and fall through to the common exit. */
.Lzero:
	xorl	%eax,%eax

	/* Common exit: %eax holds the result; restore caller's %esi. */
.Ldone:
	popl	%esi
	ret
END(memchr)
